#Table 1
assetcapmerged_sub<-data.frame()
for (s in samplestklist) {
  assetcapmerged_sub<-rbind(assetcapmerged_sub,subset(assetcapmerged,cusip==s))
}
rm(s)

#pairwise combination summary statistics
ss_numstkpairfund<-data.frame()
ss_numstkpairfund<-rbind(ss_numstkpairfund,data.frame(Period="Full Sample",Stocks=length(unique(assetcapmerged_sub$cusip)),Pairs=length(unique(PCsharesmerged$pairno)),Funds=length(unique(assetcapmerged_sub$fundno))))

ssacmexgfc<-subset(assetcapmerged_sub, substr(assetcapmerged_sub$fdate,start = 1,stop = 6) <= 200705 | substr(assetcapmerged_sub$fdate,start = 1,stop = 6) >= 200907)
sspcexgfc<-subset(PCsharesmerged, substr(PCsharesmerged$fdate,start = 1,stop = 6) <= 200705 | substr(PCsharesmerged$fdate,start = 1,stop = 6) >= 200907)
ss_numstkpairfund<-rbind(ss_numstkpairfund,data.frame(Period="Excluding GFC",Stocks=length(unique(ssacmexgfc$cusip)),Pairs=length(unique(sspcexgfc$pairno)),Funds=length(unique(ssacmexgfc$fundno))))

ssacmgfc<-subset(assetcapmerged_sub, substr(assetcapmerged_sub$fdate,start = 1,stop = 6) >= 200706 &  substr(assetcapmerged_sub$fdate,start = 1,stop = 6) <= 200906)
sspcgfc<-subset(PCsharesmerged, substr(PCsharesmerged$fdate,start = 1,stop = 6) >= 200706 & substr(PCsharesmerged$fdate,start = 1,stop = 6) <= 200906)
ss_numstkpairfund<-rbind(ss_numstkpairfund,data.frame(Period="GFC",Stocks=length(unique(ssacmgfc$cusip)),Pairs=length(unique(sspcgfc$pairno)),Funds=length(unique(ssacmgfc$fundno))))

rm(ssacmexgfc,sspcexgfc,ssacmgfc,sspcgfc)


spf<-data.frame()
fps<-data.frame()
for (q in unique(assetcapmerged_sub$fdate)){
  ssacm<-subset(assetcapmerged_sub,fdate==q)
  for(f in unique(ssacm$fundno)){
    spf<-rbind(spf,data.frame(fdate=q,fundno=f,nostk=nrow(subset(ssacm,fundno==f)),vstk=sum(subset(ssacm,fundno==f)$shares*subset(ssacm,fundno==f)$prc) ))
  }
  for(s in unique(ssacm$cusip)){
    fps<-rbind(fps,data.frame(fdate=q,cusip=s,nofund=nrow(subset(ssacm,cusip==s)),vstk=sum(subset(ssacm,cusip==s)$shares*subset(ssacm,cusip==s)$prc) ))
  }
}
ss_stkfund<-data.frame()
ss_stkfund<-rbind(ss_stkfund,data.frame(Variable="Each fund",Period="Full Sample",Mean_num=mean(spf$nostk),Median_num=median(spf$nostk),SD_num=sd(spf$nostk),Min_num=min(spf$nostk),Max_num=max(spf$nostk),
                                        Mean_val=mean(spf$vstk),Median_val=median(spf$vstk),SD_val=sd(spf$vstk),Min_val=min(spf$vstk),Max_val=max(spf$vstk)))
ssspfexgfc<-subset(spf,substr(spf$fdate,start = 1,stop = 6) <= 200705 | substr(spf$fdate,start = 1,stop = 6) >= 200907)
ss_stkfund<-rbind(ss_stkfund,data.frame(Variable="Each fund",Period="Excluding GFC",Mean_num=mean(ssspfexgfc$nostk),Median_num=median(ssspfexgfc$nostk),SD_num=sd(ssspfexgfc$nostk),Min_num=min(ssspfexgfc$nostk),Max_num=max(ssspfexgfc$nostk),
                                        Mean_val=mean(ssspfexgfc$vstk),Median_val=median(ssspfexgfc$vstk),SD_val=sd(ssspfexgfc$vstk),Min_val=min(ssspfexgfc$vstk),Max_val=max(ssspfexgfc$vstk)))
ssspfgfc<-subset(spf,substr(spf$fdate,start = 1,stop = 6) >= 200706 & substr(spf$fdate,start = 1,stop = 6) <= 200906)
ss_stkfund<-rbind(ss_stkfund,data.frame(Variable="Each fund",Period="GFC",Mean_num=mean(ssspfgfc$nostk),Median_num=median(ssspfgfc$nostk),SD_num=sd(ssspfgfc$nostk),Min_num=min(ssspfgfc$nostk),Max_num=max(ssspfgfc$nostk),
                                        Mean_val=mean(ssspfgfc$vstk),Median_val=median(ssspfgfc$vstk),SD_val=sd(ssspfgfc$vstk),Min_val=min(ssspfgfc$vstk),Max_val=max(ssspfgfc$vstk)))

ss_stkfund<-rbind(ss_stkfund,data.frame(Variable="Each stock",Period="Full Sample",Mean_num=mean(fps$nofund),Median_num=median(fps$nofund),SD_num=sd(fps$nofund),Min_num=min(fps$nofund),Max_num=max(fps$nofund),
                                        Mean_val=mean(fps$vstk),Median_val=median(fps$vstk),SD_val=sd(fps$vstk),Min_val=min(fps$vstk),Max_val=max(fps$vstk)))
ssfpsexgfc<-subset(fps,substr(fps$fdate,start = 1,stop = 6) <= 200705 | substr(fps$fdate,start = 1,stop = 6) >= 200907)
ss_stkfund<-rbind(ss_stkfund,data.frame(Variable="Each stock",Period="Excluding GFC",Mean_num=mean(ssfpsexgfc$nofund),Median_num=median(ssfpsexgfc$nofund),SD_num=sd(ssfpsexgfc$nofund),Min_num=min(ssfpsexgfc$nofund),Max_num=max(ssfpsexgfc$nofund),
                                        Mean_val=mean(ssfpsexgfc$vstk),Median_val=median(ssfpsexgfc$vstk),SD_val=sd(ssfpsexgfc$vstk),Min_val=min(ssfpsexgfc$vstk),Max_val=max(ssfpsexgfc$vstk)))
ssfpsgfc<-subset(fps,substr(fps$fdate,start = 1,stop = 6) >= 200706 & substr(fps$fdate,start = 1,stop = 6) <= 200906)
ss_stkfund<-rbind(ss_stkfund,data.frame(Variable="Each stock",Period="GFC",Mean_num=mean(ssfpsgfc$nofund),Median_num=median(ssfpsgfc$nofund),SD_num=sd(ssfpsgfc$nofund),Min_num=min(ssfpsgfc$nofund),Max_num=max(ssfpsgfc$nofund),
                                        Mean_val=mean(ssfpsgfc$vstk),Median_val=median(ssfpsgfc$vstk),SD_val=sd(ssfpsgfc$vstk),Min_val=min(ssfpsgfc$vstk),Max_val=max(ssfpsgfc$vstk)))

rm(spf,fps,q,f,s,ssspfexgfc,ssfpsgfc,ssacm)

#pair summary
ppf<-data.frame()
pps<-data.frame()
cpp<-data.frame()
for (q in unique(PCsharesmerged$fdate)){
  sspc<-subset(PCsharesmerged,fdate==q)
  for(f in unique(sspc$fundno)){
    ppf<-rbind(ppf,data.frame(fdate=q,fundno=f,nopair=nrow(subset(sspc,fundno==f))))
  }
  for(s in union(unique(sspc$stki),unique(sspc$stkj))){
    pps<-rbind(pps,data.frame(fdate=q,cusip=s,nopair=nrow(rbind(subset(sspc,stki==s),subset(sspc,stkj==s)))))
  }
  for(p in unique(sspc$pair)){
    cpp<-rbind(cpp,data.frame(fdate=q,pair=p,sumfcap=sum(subset(sspc,pair==p)$FCAP)))
  }
}
ss_pair<-data.frame()
ss_pair<-rbind(ss_pair,data.frame(Variable="Pairs per fund",Period="Full Sample",Mean=mean(ppf$nopair),Median=median(ppf$nopair),SD=sd(ppf$nopair),Min=min(ppf$nopair),Max=max(ppf$nopair)))
ssppf19<-subset(ppf,substr(ppf$fdate,start = 1,stop = 6) <= 200705 | substr(ppf$fdate,start = 1,stop = 6) >= 200907)
ss_pair<-rbind(ss_pair,data.frame(Variable="Pairs per fund",Period="Excluding GFC",Mean=mean(ssppf19$nopair),Median=median(ssppf19$nopair),SD=sd(ssppf19$nopair),Min=min(ssppf19$nopair),Max=max(ssppf19$nopair)))
ssppf22<-subset(ppf,substr(ppf$fdate,start = 1,stop = 6) >= 200706 & substr(ppf$fdate,start = 1,stop = 6) <= 200906)
ss_pair<-rbind(ss_pair,data.frame(Variable="Pairs per fund",Period="GFC",Mean=mean(ssppf22$nopair),Median=median(ssppf22$nopair),SD=sd(ssppf22$nopair),Min=min(ssppf22$nopair),Max=max(ssppf22$nopair)))

ss_pair<-rbind(ss_pair,data.frame(Variable="Pairs per stock",Period="Full Sample",Mean=mean(pps$nopair),Median=median(pps$nopair),SD=sd(pps$nopair),Min=min(pps$nopair),Max=max(pps$nopair)))
sspps19<-subset(pps,substr(pps$fdate,start = 1,stop = 6) <= 200705 | substr(pps$fdate,start = 1,stop = 6) >= 200907)
ss_pair<-rbind(ss_pair,data.frame(Variable="Pairs per stock",Period="Excluding GFC",Mean=mean(sspps19$nopair),Median=median(sspps19$nopair),SD=sd(sspps19$nopair),Min=min(sspps19$nopair),Max=max(sspps19$nopair)))
sspps22<-subset(pps,substr(pps$fdate,start = 1,stop = 6) >= 200706 & substr(pps$fdate,start = 1,stop = 6) <= 200906)
ss_pair<-rbind(ss_pair,data.frame(Variable="Pairs per stock",Period="GFC",Mean=mean(sspps22$nopair),Median=median(sspps22$nopair),SD=sd(sspps22$nopair),Min=min(sspps22$nopair),Max=max(sspps22$nopair)))

ss_pair<-rbind(ss_pair,data.frame(Variable="Pairwise Connectedness",Period="Full Sample",Mean=mean(cpp$sumfcap),Median=median(cpp$sumfcap),SD=sd(cpp$sumfcap),Min=min(cpp$sumfcap),Max=max(cpp$sumfcap)))
sscpp19<-subset(cpp,substr(cpp$fdate,start = 1,stop = 6) <= 200705 | substr(cpp$fdate,start = 1,stop = 6) >= 200907)
ss_pair<-rbind(ss_pair,data.frame(Variable="Pairwise Connectedness",Period="Excluding GFC",Mean=mean(sscpp19$sumfcap),Median=median(sscpp19$sumfcap),SD=sd(sscpp19$sumfcap),Min=min(sscpp19$sumfcap),Max=max(sscpp19$sumfcap)))
sscpp22<-subset(cpp,substr(cpp$fdate,start = 1,stop = 6) >= 200706 & substr(cpp$fdate,start = 1,stop = 6) <= 200906)
ss_pair<-rbind(ss_pair,data.frame(Variable="Pairwise Connectedness",Period="GFC",Mean=mean(sscpp22$sumfcap),Median=median(sscpp22$sumfcap),SD=sd(sscpp22$sumfcap),Min=min(sscpp22$sumfcap),Max=max(sscpp22$sumfcap)))

rm(ppf,pps,cpp,q,f,s,p,ssppf19,ssppf22,sspps19,sspps22,sscpp19,sscpp22,sspc)














#Table 2
logreg_stdfcapcen$logstkfcap<-log(logreg_stdfcapcen$stkfcap)
logreg_stdfcapcen$logeigcenwgt<-log(logreg_stdfcapcen$eigcenwgt)
logreg_stdfcapcen$logeigcenwgt_normstdfcap<-log(logreg_stdfcapcen$eigcenwgt_normstdfcap)
logreg_stdfcapcen$logeigcenwgt_normstdrankfcap<-log(logreg_stdfcapcen$eigcenwgt_normstdrankfcap)

logreg_stdfcapcen[sapply(logreg_stdfcapcen, is.infinite)] <- NA

library(plm)
logreg_stdfcapcen <- pdata.frame(logreg_stdfcapcen, index = c("cusip","quarter"))

library(dplyr)
logreg_stdfcapcen <- logreg_stdfcapcen %>%
  group_by(cusip) %>%
  mutate(logeigcenwgt_normstdrankfcap_L1 = lag(logeigcenwgt_normstdrankfcap, n = 1))
logreg_stdfcapcen <- logreg_stdfcapcen %>%
  group_by(cusip) %>%
  mutate(logeigcenwgt_normstdrankfcap_L2 = lag(logeigcenwgt_normstdrankfcap, n = 2))
logreg_stdfcapcen <- logreg_stdfcapcen %>%
  group_by(cusip) %>%
  mutate(logeigcenwgt_normstdrankfcap_L3 = lag(logeigcenwgt_normstdrankfcap, n = 3))
logreg_stdfcapcen <- logreg_stdfcapcen %>%
  group_by(cusip) %>%
  mutate(logamihudsc_L1 = lag(logamihudsc, n = 1))
logreg_stdfcapcen <- logreg_stdfcapcen %>%
  group_by(cusip) %>%
  mutate(logamihudsc_L2 = lag(logamihudsc, n = 2))
logreg_stdfcapcen <- logreg_stdfcapcen %>%
  group_by(cusip) %>%
  mutate(logamihudsc_L3 = lag(logamihudsc, n = 3))
logreg_stdfcapcen <- logreg_stdfcapcen %>%
  group_by(cusip) %>%
  mutate(angvol_L1 = lag(angvol, n = 1))
logreg_stdfcapcen <- logreg_stdfcapcen %>%
  group_by(cusip) %>%
  mutate(vix_L1 = lag(VIX, n = 1))
logreg_stdfcapcen <- logreg_stdfcapcen %>%
  group_by(cusip) %>%
  mutate(dm1_L1 = lag(dm1, n = 1))

#univariate regressions
bipanelreg_result<-data.frame()
model <- plm(logamihudsc ~ logeigcenwgt_normstdrankfcap_L1 + logeigcenwgt_normstdrankfcap_L2 + logeigcenwgt_normstdrankfcap_L3, data = logreg_stdfcapcen, model = "within")
bipanelreg_result<-rbind(bipanelreg_result,data.frame(scaling="amihud~connect",alpha=within_intercept(model)[1],
                                                      stderr_alpha=attr(within_intercept(model), "se"),t_a=within_intercept(model)[1]/attr(within_intercept(model), "se"),
                                                      beta1=summary(model)$coefficients[1,1],stderr_beta1=summary(model)$coefficients[1,2],p_b1=summary(model)$coefficients[1,4],
                                                      beta2=summary(model)$coefficients[2,1],stderr_beta2=summary(model)$coefficients[2,2],p_b2=summary(model)$coefficients[2,4],
                                                      beta3=summary(model)$coefficients[3,1],stderr_beta3=summary(model)$coefficients[3,2],p_b3=summary(model)$coefficients[3,4],
                                                      beta_angvol="NA",stderr_beta_angvol="NA",p_b_angvol="NA",
                                                      beta_VIX="NA",stderr_beta_VIX="NA",p_b_VIX="NA",
                                                      beta_dm1="NA",stderr_beta_dm1="NA",p_b_dm1="NA",
                                                      adjrsq=r.squared(model, dfcor = TRUE)
))

model <- plm(logeigcenwgt_normstdrankfcap ~ logamihudsc_L1 +logamihudsc_L2 +logamihudsc_L3, data = logreg_stdfcapcen, model = "within")
bipanelreg_result<-rbind(bipanelreg_result,data.frame(scaling="connect~amihud",alpha=within_intercept(model)[1],
                                                      stderr_alpha=attr(within_intercept(model), "se"),t_a=within_intercept(model)[1]/attr(within_intercept(model), "se"),
                                                      beta1=summary(model)$coefficients[1,1],stderr_beta1=summary(model)$coefficients[1,2],p_b1=summary(model)$coefficients[1,4],
                                                      beta2=summary(model)$coefficients[2,1],stderr_beta2=summary(model)$coefficients[2,2],p_b2=summary(model)$coefficients[2,4],
                                                      beta3=summary(model)$coefficients[3,1],stderr_beta3=summary(model)$coefficients[3,2],p_b3=summary(model)$coefficients[3,4],
                                                      beta_angvol="NA",stderr_beta_angvol="NA",p_b_angvol="NA",
                                                      beta_VIX="NA",stderr_beta_VIX="NA",p_b_VIX="NA",
                                                      beta_dm1="NA",stderr_beta_dm1="NA",p_b_dm1="NA",
                                                      adjrsq=r.squared(model, dfcor = TRUE)
))

logreg_stdfcapcen$dm1_L1_sc<-logreg_stdfcapcen$dm1_L1/1000

#multivariate regressions
model <- plm(logamihudsc ~ logeigcenwgt_normstdrankfcap_L1 + logeigcenwgt_normstdrankfcap_L2 + logeigcenwgt_normstdrankfcap_L3 +angvol_L1+vix_L1+dm1_L1, data = logreg_stdfcapcen, model = "within")
bipanelreg_result<-rbind(bipanelreg_result,data.frame(scaling="amihud~connect",alpha=within_intercept(model)[1],
                                                      stderr_alpha=attr(within_intercept(model), "se"),t_a=within_intercept(model)[1]/attr(within_intercept(model), "se"),
                                                      beta1=summary(model)$coefficients[1,1],stderr_beta1=summary(model)$coefficients[1,2],p_b1=summary(model)$coefficients[1,4],
                                                      beta2=summary(model)$coefficients[2,1],stderr_beta2=summary(model)$coefficients[2,2],p_b2=summary(model)$coefficients[2,4],
                                                      beta3=summary(model)$coefficients[3,1],stderr_beta3=summary(model)$coefficients[3,2],p_b3=summary(model)$coefficients[3,4],
                                                      beta_angvol=summary(model)$coefficients[4,1],stderr_beta_angvol=summary(model)$coefficients[4,2],p_b_angvol=summary(model)$coefficients[4,4],
                                                      beta_VIX=summary(model)$coefficients[5,1],stderr_beta_VIX=summary(model)$coefficients[5,2],p_b_VIX=summary(model)$coefficients[5,4],
                                                      beta_dm1=summary(model)$coefficients[6,1],stderr_beta_dm1=summary(model)$coefficients[6,2],p_b_dm1=summary(model)$coefficients[6,4],
                                                      adjrsq=r.squared(model, dfcor = TRUE)
))

model <- plm(logeigcenwgt_normstdrankfcap ~ logamihudsc_L1 +logamihudsc_L2 +logamihudsc_L3 +angvol_L1+vix_L1+dm1_L1, data = logreg_stdfcapcen, model = "within")
bipanelreg_result<-rbind(bipanelreg_result,data.frame(scaling="connect~amihud",alpha=within_intercept(model)[1],
                                                      stderr_alpha=attr(within_intercept(model), "se"),t_a=within_intercept(model)[1]/attr(within_intercept(model), "se"),
                                                      beta1=summary(model)$coefficients[1,1],stderr_beta1=summary(model)$coefficients[1,2],p_b1=summary(model)$coefficients[1,4],
                                                      beta2=summary(model)$coefficients[2,1],stderr_beta2=summary(model)$coefficients[2,2],p_b2=summary(model)$coefficients[2,4],
                                                      beta3=summary(model)$coefficients[3,1],stderr_beta3=summary(model)$coefficients[3,2],p_b3=summary(model)$coefficients[3,4],
                                                      beta_angvol=summary(model)$coefficients[4,1],stderr_beta_angvol=summary(model)$coefficients[4,2],p_b_angvol=summary(model)$coefficients[4,4],
                                                      beta_VIX=summary(model)$coefficients[5,1],stderr_beta_VIX=summary(model)$coefficients[5,2],p_b_VIX=summary(model)$coefficients[5,4],
                                                      beta_dm1=summary(model)$coefficients[6,1],stderr_beta_dm1=summary(model)$coefficients[6,2],p_b_dm1=summary(model)$coefficients[6,4],
                                                      adjrsq=r.squared(model, dfcor = TRUE)
))
rm(model)














#Table 3
#mutual fund total financial asset as IV
logreg_stdfcapcen$logstkfcap<-log(logreg_stdfcapcen$stkfcap)
logreg_stdfcapcen$logeigcenwgt<-log(logreg_stdfcapcen$eigcenwgt)
logreg_stdfcapcen$logeigcenwgt_normstdfcap<-log(logreg_stdfcapcen$eigcenwgt_normstdfcap)
logreg_stdfcapcen$logeigcenwgt_normstdrankfcap<-log(logreg_stdfcapcen$eigcenwgt_normstdrankfcap)

logreg_stdfcapcen[sapply(logreg_stdfcapcen, is.infinite)] <- NA

library(plm)
logreg_stdfcapcen <- pdata.frame(logreg_stdfcapcen, index = c("cusip","quarter"))

library(dplyr)
logreg_stdfcapcen <- logreg_stdfcapcen %>%
  group_by(cusip) %>%
  mutate(logeigcenwgt_normstdrankfcap_L1 = lag(logeigcenwgt_normstdrankfcap, n = 1))
logreg_stdfcapcen <- logreg_stdfcapcen %>%
  group_by(cusip) %>%
  mutate(logeigcenwgt_normstdrankfcap_L2 = lag(logeigcenwgt_normstdrankfcap, n = 2))
logreg_stdfcapcen <- logreg_stdfcapcen %>%
  group_by(cusip) %>%
  mutate(logeigcenwgt_normstdrankfcap_L3 = lag(logeigcenwgt_normstdrankfcap, n = 3))
logreg_stdfcapcen <- logreg_stdfcapcen %>%
  group_by(cusip) %>%
  mutate(logamihudsc_L1 = lag(logamihudsc, n = 1))
logreg_stdfcapcen <- logreg_stdfcapcen %>%
  group_by(cusip) %>%
  mutate(logamihudsc_L2 = lag(logamihudsc, n = 2))
logreg_stdfcapcen <- logreg_stdfcapcen %>%
  group_by(cusip) %>%
  mutate(logamihudsc_L3 = lag(logamihudsc, n = 3))
logreg_stdfcapcen <- logreg_stdfcapcen %>%
  group_by(cusip) %>%
  mutate(angvol_L1 = lag(angvol, n = 1))
logreg_stdfcapcen <- logreg_stdfcapcen %>%
  group_by(cusip) %>%
  mutate(vix_L1 = lag(VIX, n = 1))
logreg_stdfcapcen <- logreg_stdfcapcen %>%
  group_by(cusip) %>%
  mutate(dm1_L1 = lag(dm1, n = 1))

#import raw_mf_tfa_quarter

#import to the panel data
logreg_stdfcapcen$mftfa<-rep(raw_mf_tfa_quarter$mftfa,608)
logreg_stdfcapcen$mftfa_L1<-rep(c(NA,raw_mf_tfa_quarter$mftfa[-length(raw_mf_tfa_quarter$mftfa)]),608)

rm(raw_mf_tfa_quarter)

library(plm)
#Durbin-Wu-Hausman test
model_fe <- plm(logamihudsc ~ logeigcenwgt_normstdrankfcap_L1, data = logreg_stdfcapcen, model = "within")
model_iv <- plm(logamihudsc ~ logeigcenwgt_normstdrankfcap_L1 | mftfa_L1, data = logreg_stdfcapcen, model = "within",inst.method = "bvk")
phtest(model_fe,model_iv)

summary(model_fe)
summary(model_iv)

rm(model_fe,model_iv)


#with controls
#Durbin-Wu-Hausman test
model_fe <- plm(logamihudsc ~ logeigcenwgt_normstdrankfcap_L1 +angvol_L1+vix_L1+dm1_L1, data = logreg_stdfcapcen, model = "within")
model_iv <- plm(logamihudsc ~ logeigcenwgt_normstdrankfcap_L1 +angvol_L1+vix_L1+dm1_L1| mftfa_L1 +angvol_L1+vix_L1+dm1_L1, data = logreg_stdfcapcen, model = "within",inst.method = "bvk")
phtest(model_fe,model_iv)

summary(model_fe)
summary(model_iv)

rm(model_fe,model_iv)




#index inclusion as IV
#import SPX_inclusion
logreg_stdfcapcen$spxinc<-NA
#create dummy
for (q in quarterlist) {
  for (s in subset(logreg_stdfcapcen,quarter==q)$cusip) {
    logreg_stdfcapcen[logreg_stdfcapcen$cusip == s & logreg_stdfcapcen$quarter == q, ]$spxinc<-0
    if (s %in% subset(SPX_inclusion,quarter==q)$cusip) {
      logreg_stdfcapcen[logreg_stdfcapcen$cusip == s & logreg_stdfcapcen$quarter == q, ]$spxinc<-1
    }
  }
}
rm(q,s)

library(plm)
logreg_stdfcapcen <- pdata.frame(logreg_stdfcapcen, index = c("cusip", "qrank"))
logreg_stdfcapcen$spxinc_L1<-lag(logreg_stdfcapcen$spxinc,k=1)

#Durbin-Wu-Hausman test
model_fe <- plm(logamihudsc ~ logeigcenwgt_normstdrankfcap_L1, data = logreg_stdfcapcen, model = "within")
model_iv <- plm(logamihudsc ~ logeigcenwgt_normstdrankfcap_L1 | spxinc_L1, data = logreg_stdfcapcen, model = "within",inst.method = "bvk")
phtest(model_fe,model_iv)

summary(model_fe)
summary(model_iv)

rm(model_fe,model_iv)

#with controls
#Durbin-Wu-Hausman test
model_fe <- plm(logamihudsc ~ logeigcenwgt_normstdrankfcap_L1 +angvol_L1+vix_L1+dm1_L1, data = logreg_stdfcapcen, model = "within")
model_iv <- plm(logamihudsc ~ logeigcenwgt_normstdrankfcap_L1 +angvol_L1+vix_L1+dm1_L1| spxinc_L1 +angvol_L1+vix_L1+dm1_L1, data = logreg_stdfcapcen, model = "within",inst.method = "bvk")
phtest(model_fe,model_iv)

summary(model_fe)
summary(model_iv)

rm(model_fe,model_iv)













#Table 4
sort_connectedness<-subset(logreg_stdfcapcen,select = c("cusip","qrank","quarter","amihud_sc","eigcenwgt_normstdrankfcap","qlogxret","ffc4alpha","size"))

decile<-data.frame()
for (q in unique(sort_connectedness$qrank)) {
  ss<-subset(sort_connectedness,qrank==q)
  ss<-ss[!is.na(ss$eigcenwgt_normstdrankfcap),]
  ss$fcap_decile<-cut(ss$eigcenwgt_normstdrankfcap,breaks=quantile(ss$eigcenwgt_normstdrankfcap,probs=seq(0,1,by=0.25)),include.lowest=TRUE,labels=FALSE)
  decile<-rbind(decile,ss)
}
rm(q,ss)
temp<-subset(decile,select = c("cusip","qrank","fcap_decile"))
rm(decile)
sort_connectedness<-merge(sort_connectedness,temp,by=c("cusip","qrank"),all.x = T)
rm(temp)

sortconnect_result<-data.frame()
for (qc in 1:4) {
  ss<-na.omit(subset(sort_connectedness,fcap_decile==qc))
  avgami<-median(ss$amihud_sc)
  avgxret<-median(ss$qlogxret)
  #quarterly alpha
  avgalpha<-median(ss$ffc4alpha)*63
  avgsize<-median(ss$size)
  sortconnect_result<-rbind(sortconnect_result,data.frame(q_connectedness=qc,amihud_sc_median=avgami,xret_median=avgxret,alpha_median=avgalpha,size_median=avgsize))
}
rm(qc,ss,avgami,avgxret,avgalpha,avgsize)

hml_amihud_sc<-subset(sort_connectedness,fcap_decile==4)$amihud_sc-subset(sort_connectedness,fcap_decile==1)$amihud_sc
t.test(hml_amihud_sc)
hml_xret<-subset(sort_connectedness,fcap_decile==4)$qlogxret-subset(sort_connectedness,fcap_decile==1)$qlogxret
t.test(hml_xret)
hml_alpha<-subset(sort_connectedness,fcap_decile==4)$ffc4alpha-subset(sort_connectedness,fcap_decile==1)$ffc4alpha
t.test(hml_alpha)
hml_size<-subset(sort_connectedness,fcap_decile==4)$size-subset(sort_connectedness,fcap_decile==1)$size
t.test(hml_size)
rm(hml_amihud_sc,hml_xret,hml_alpha,hml_size)

#t.test(na.omit(subset(sort_connectedness,fcap_decile==4))$ffc4alpha)
#t.test(na.omit(subset(sort_connectedness,fcap_decile==3))$ffc4alpha)
#t.test(na.omit(subset(sort_connectedness,fcap_decile==2))$ffc4alpha)
#t.test(na.omit(subset(sort_connectedness,fcap_decile==1))$ffc4alpha)

#t.test(na.omit(subset(sort_connectedness,fcap_decile==4))$qlogxret)
#t.test(na.omit(subset(sort_connectedness,fcap_decile==3))$qlogxret)
#t.test(na.omit(subset(sort_connectedness,fcap_decile==2))$qlogxret)
#t.test(na.omit(subset(sort_connectedness,fcap_decile==1))$qlogxret)











#Table 9
bacsort<-data.frame()
for (q in 1:94) {
  ss<-subset(logreg_stdfcapcen_ffc4,qrank==q)
  ss<-na.omit(ss,"eigcenwgt_normstdrankfcap")
  ss<-ss[order(ss$eigcenwgt_normstdrankfcap),]
  if (nrow(ss)>1) {
    ss$eigcenrank<-c(1:nrow(ss))
  }
  median_eigcen <- median(ss$eigcenwgt_normstdrankfcap, na.rm = TRUE)
  ss$half <- ifelse(ss$eigcenwgt_normstdrankfcap <= median_eigcen, 1, 2)
  ss$quartile <- cut(ss$eigcenwgt_normstdrankfcap, breaks = quantile(ss$eigcenwgt_normstdrankfcap, probs = seq(0, 1, 0.25), na.rm = TRUE), labels = FALSE, include.lowest = TRUE)
  ss$eigcenrank<-as.numeric(ss$eigcenrank)
  avgrank<-mean(ss$eigcenrank)
  newss<-data.frame()
  for (h in 1:2) {
    ssss<-subset(ss,half==h)
    ssss$weight<-(2/sum(abs(ss$eigcenrank-avgrank)))*-(ssss$eigcenrank-avgrank)
    newss<-rbind(newss,ssss)
  }
  bacsort<-rbind(bacsort,newss)
}
rm(ss,median_eigcen,avgrank,newss,ssss,q,h)

bacquartile <- data.frame()
for (q in 2:94) {
  ss <- subset(bacsort, qrank == q)
  cusip_subset1 <- subset(bacsort, qrank == q  & quartile==1)$cusip
  cusip_subset2 <- subset(bacsort, qrank == q  & quartile==4)$cusip
  r_bac<- -mean(subset(ss, quartile == 1)$qxret)+mean(subset(ss, quartile == 4)$qxret)
  r_d1 <- ifelse(length(subset(ss, quartile == 1)$qxret) > 0, mean(subset(ss, quartile == 1)$qxret, na.rm = T), NA)
  r_d2 <- ifelse(length(subset(ss, quartile == 2)$qxret) > 0, mean(subset(ss, quartile == 2)$qxret, na.rm = T), NA)
  r_d3 <- ifelse(length(subset(ss, quartile == 3)$qxret) > 0, mean(subset(ss, quartile == 3)$qxret, na.rm = T), NA)
  r_d4 <- ifelse(length(subset(ss, quartile == 4)$qxret) > 0, mean(subset(ss, quartile == 4)$qxret, na.rm = T), NA)
  bacquartile <- rbind(bacquartile, data.frame(qrank = q, bac = r_bac, p1 = r_d1, p2 = r_d2, p3 = r_d3, p4 = r_d4))
}
rm(q,ss,cusip_subset1,cusip_subset2,r_bac,r_d1,r_d2,r_d3,r_d4)

bacquartile$bac[is.na(bacquartile$bac)]<-mean(bacquartile$bac,na.rm = T)
bacquartile$p1[is.na(bacquartile$p1)]<-mean(bacquartile$p1,na.rm = T)
bacquartile$p2[is.na(bacquartile$p2)]<-mean(bacquartile$p2,na.rm = T)
bacquartile$p3[is.na(bacquartile$p3)]<-mean(bacquartile$p3,na.rm = T)
bacquartile$p4[is.na(bacquartile$p4)]<-mean(bacquartile$p4,na.rm = T)

bacquartile<-merge(bacquartile,ffc4_quarter,by="qrank",all.x=T)

#regression
bacquartile$mktrf<-bacquartile$mktrf/4
bacquartile$smb<-bacquartile$smb/4
bacquartile$hml<-bacquartile$hml/4
bacquartile$mom<-bacquartile$mom/4

bacregresult<-data.frame()
model<-lm(bac~mktrf+smb+hml+mom,data=bacquartile)
bacregresult<-rbind(bacregresult,data.frame(regression="bac",alpha=summary(model)$coefficients[1,1],stderr_alpha=summary(model)$coefficients[1,2],
                                            pvalue_alpha=summary(model)$coefficients[1,4],beta=summary(model)$coefficients[2,1],
                                            stderr_beta=summary(model)$coefficients[2,2],pvalue_beta=summary(model)$coefficients[2,4],
                                            adjrsq=summary(model)$adj.r.squared,ar=summary(model)$coefficients[1,1]/summary(model)$sigma*sqrt(12)))

model<-lm(p1~mktrf+smb+hml+mom,data=bacquartile)
bacregresult<-rbind(bacregresult,data.frame(regression="p1",alpha=summary(model)$coefficients[1,1],stderr_alpha=summary(model)$coefficients[1,2],
                                            pvalue_alpha=summary(model)$coefficients[1,4],beta=summary(model)$coefficients[2,1],
                                            stderr_beta=summary(model)$coefficients[2,2],pvalue_beta=summary(model)$coefficients[2,4],
                                            adjrsq=summary(model)$adj.r.squared,ar=summary(model)$coefficients[1,1]/summary(model)$sigma*sqrt(12)))

model<-lm(p2~mktrf+smb+hml+mom,data=bacquartile)
bacregresult<-rbind(bacregresult,data.frame(regression="p2",alpha=summary(model)$coefficients[1,1],stderr_alpha=summary(model)$coefficients[1,2],
                                            pvalue_alpha=summary(model)$coefficients[1,4],beta=summary(model)$coefficients[2,1],
                                            stderr_beta=summary(model)$coefficients[2,2],pvalue_beta=summary(model)$coefficients[2,4],
                                            adjrsq=summary(model)$adj.r.squared,ar=summary(model)$coefficients[1,1]/summary(model)$sigma*sqrt(12)))

model<-lm(p3~mktrf+smb+hml+mom,data=bacquartile)
bacregresult<-rbind(bacregresult,data.frame(regression="p3",alpha=summary(model)$coefficients[1,1],stderr_alpha=summary(model)$coefficients[1,2],
                                            pvalue_alpha=summary(model)$coefficients[1,4],beta=summary(model)$coefficients[2,1],
                                            stderr_beta=summary(model)$coefficients[2,2],pvalue_beta=summary(model)$coefficients[2,4],
                                            adjrsq=summary(model)$adj.r.squared,ar=summary(model)$coefficients[1,1]/summary(model)$sigma*sqrt(12)))

model<-lm(p4~mktrf+smb+hml+mom,data=bacquartile)
bacregresult<-rbind(bacregresult,data.frame(regression="p4",alpha=summary(model)$coefficients[1,1],stderr_alpha=summary(model)$coefficients[1,2],
                                            pvalue_alpha=summary(model)$coefficients[1,4],beta=summary(model)$coefficients[2,1],
                                            stderr_beta=summary(model)$coefficients[2,2],pvalue_beta=summary(model)$coefficients[2,4],
                                            adjrsq=summary(model)$adj.r.squared,ar=summary(model)$coefficients[1,1]/summary(model)$sigma*sqrt(12)))
rm(model)

